library(dplyr)
library(ggplot2)
root = "./"
source(file.path(root,"MAIN.R"))
import_parameters(params)## **** __Utilized Cores__ **** = 2$subsetGenes
## [1] "protein_coding"
##
## $subsetCells
## [1] 500
##
## $resolution
## [1] 0.6
##
## $resultsPath
## [1] "./Results"
##
## $nCores
## [1] 2
##
## $perplexity
## [1] 30
Find genes of interest through graph-autocorrelation analysis.
find_gene_modules essentially runs UMAP on the genes (as opposed to the cells) and then groups them into modules using Louvain community analysis.gene_module_df = monocle3::find_gene_modules(cds[var.genes[1:1000], ],
# resolution=1e-2,
reduction_method = "UMAP",
max_components = 3,
cores=nCores,
umap.fast_sgd = F, # Slower but determinstic results
random_seed = 2019)
data.table::fwrite(gene_module_df, "./Results/sc.modules_by_gene.txt", sep="\t")
createDT(gene_module_df)# Create pheatmap table
cell_group_df = tibble::tibble(cell=row.names(SummarizedExperiment::colData(cds)),
cell_group=monocle3::clusters(cds)[colnames(cds)])
agg_mat = monocle3::aggregate_gene_expression(cds, gene_module_df, cell_group_df)
row.names(agg_mat) = stringr::str_c("Module ", row.names(agg_mat))
colnames(agg_mat) = stringr::str_c("Cluster ", colnames(agg_mat))
module_cluster_exp <- data.frame(as.matrix(agg_mat))
data.table::fwrite(data.table::data.table(module_cluster_exp, keep.rownames = "Module"),
"./Results/sc.modules_by_cluster.txt", sep="\t" )
# Plot
pheatmap::pheatmap(agg_mat, cluster_rows=T, cluster_cols=F,
scale="column", clustering_method="ward.D2",
fontsize=6) # plotly R documentation: https://plot.ly/r/
# rgl::plot3d(x = gene_module_df$dim_1, y=gene_module_df$dim_2, z=gene_module_df$dim_3, )
p.3d <- plotly::plot_ly(gene_module_df,
x = ~dim_1, y = ~dim_2, z = ~dim_3,
color = ~module,
text = ~paste("Gene:",id),
colors = "Dark2",
# alpha = .7,
size = 5
) %>%
plotly::layout(title = 'scRNA-seq Gene Co-expression Modules') %>%
plotly::add_markers() %>%
plotly::layout(scene = list(xaxis = list(title = 'UMAP 1'),
yaxis = list(title = 'UMAP 2'),
zaxis = list(title = 'UMAP 3')))
htmltools::tagList(setNames(list(p.3d), NULL)) # gene_module_df <-data.table::fread("./Results/sc.modules_by_gene.txt")
monocle3::plot_cells(cds,
genes=gene_module_df,
group_cells_by="cluster",
color_cells_by="cluster",
show_trajectory_graph=F) # rasterize = T (for super high-res figures)p2 <- monocle3::plot_cells(cds,
# sc.module 9 correponds to sc.module 5 in the old analyses
genes=subset(gene_module_df, module==9),
group_cells_by="cluster",
color_cells_by="cluster",
show_trajectory_graph=F)
print(p2)# Prepare bulk modules
bulk.modules <- data.table::fread("./Data/Katia_Modules/Gene_modules.tbl.txt")
# bulk.modules$ensembl.id <- gsub("\\.*","", bulk.modules$gene_ids)
# ensembl.dict <- ENSG_to_HGNC(ensembl_ids = unique(bulk.modules$gene_ids),
# reference_genome="grch38",
# input = "ensembl_transcript_id")
if(!file.exists("./Data/Katia_Modules/ens.geneid.gencode.v30")){
system("scp schilb03@chimera.hpc.mssm.edu:/sc/hydra/projects/pd-omics/GRCH38.p12/ens.geneid.gencode.v30 ./Data/Katia_Modules")
}
ensembl.dt <- data.table::fread("./Data/Katia_Modules/ens.geneid.gencode.v30")
ensembl.dict <- setNames(ensembl.dt$GeneSymbol, ensembl.dt$gene_id)
bulk.modules$gene.symbol <- ensembl.dict[bulk.modules$gene_ids]
bulk.modules <- bulk.modules %>% dplyr::rename(module=moduleColors)
createDT(bulk.modules)## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Prepare sc modules
sc.modules <- data.table::fread("./Results/sc.modules_by_gene.txt") %>% dplyr::rename(gene.symbol=id)
createDT(sc.modules)## [1] "gprofiler2:: Running enrichment on module: 4"
## [1] "gprofiler2:: Running enrichment on module: 15"
## [1] "gprofiler2:: Running enrichment on module: 7"
## [1] "gprofiler2:: Running enrichment on module: 3"
## [1] "gprofiler2:: Running enrichment on module: 13"
## [1] "gprofiler2:: Running enrichment on module: 5"
## [1] "gprofiler2:: Running enrichment on module: 8"
## [1] "gprofiler2:: Running enrichment on module: 2"
## [1] "gprofiler2:: Running enrichment on module: 14"
## [1] "gprofiler2:: Running enrichment on module: 9"
## [1] "gprofiler2:: Running enrichment on module: 11"
## [1] "gprofiler2:: Running enrichment on module: 12"
## [1] "gprofiler2:: Running enrichment on module: 16"
## [1] "gprofiler2:: Running enrichment on module: 17"
## [1] "gprofiler2:: Running enrichment on module: 1"
## [1] "gprofiler2:: Running enrichment on module: 6"
## [1] "gprofiler2:: Running enrichment on module: 10"
## [1] "gprofiler2:: 17 tested for ontological enrichment in 49.63 seconds"
Results Summary
- Bulk.modules differentially expressed between PD-Controls
+ Preferentially expressed in canonical monocytes + darkmagenta (particularly the cells furthest from intermediate monocytes, suggesting these cells are the least activated) + pink
+ honeydew1
+ Preferentially expressed in intermediate monocytes
+ purple (not clear separation but trending)
merged.mods <- data.table:::merge.data.table(bulk.modules %>% dplyr::select(bulk.module=module, gene.symbol),
sc.modules,
by= "gene.symbol",
all.y = T) %>%
dplyr::select(gene.symbol, module=bulk.module, supermodule, dim_1, dim_2, dim_3) %>% unique()
# Selected
select.modules <- c("lavenderblush3","darkturquoise","midnightblue","salmon4",
"honeydew1","pink","darkmagenta","green","blue","brown",
"darkolivegreen","darkorange2","purple","skyblue3",
"thistle1","thistle2") # sort(unique(merged.mods$module))[1:15]
p <- monocle3::plot_cells(cds,
genes=subset(merged.mods, module %in% select.modules),
group_cells_by="cluster",
color_cells_by="cluster",
show_trajectory_graph=F)
print(p)I also plotted all 60+ of the bulk.modules (ordered alphabetically) to see if any of them were preferentially expressed in a a given monocyte subtype. Darkmagenta remains one of the most striking examples, but there are other as well.
p2 <- monocle3::plot_cells(cds,
genes=merged.mods,
group_cells_by="cluster",
color_cells_by="cluster",
show_trajectory_graph=F)
print(p2)genomeSize <- length(union(unique(bulk.modules$gene.symbol), monocle3::fData(cds)$gene_short_name))
MOD.dt <- module_vs_module_enrichment(mod.df1 = bulk.modules,
mod.df2 = sc.modules,
genomeSize = genomeSize,
save_file="./Results/bulk.modules_vs_sc.modules.txt",
verbose = F) ## [1] "mod.df1 contains 66 unique modules."
## [1] "mod.df2 contains 17 unique modules."
## [1] "Conducting enrichment tests on 1122 module-module combinations..."
## [1] "1122 enrichment tests conducted in 1.52 seconds."
## [1] "24 enrichment tests were significant (at FDR ≤ 0.05)."
## [1] "22.73% of mod.df1 modules showed enrichment for a mod.df2 module."
## [1] "82.35% of mod.df2 modules showed enrichment for a mod.df1 module."
# library(heatmaply)
mod.cast <- reshape2::acast(MOD.dt,
Module1.name~Module2.name,
value.var="Jaccard.index")
hm <- heatmaply::heatmaply(mod.cast,
k_row = 5, k_col = 5,
height = 10,
column_text_angle = 0,
xlab = "scRNA-seq Louvain Modules",
ylab = "bulk RNA-seq WGCNA Modules",
cexRow = .7,
key.title = "Jaccard Index") # file = "module-module.heatmap.html"## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
mod.enrich.path <- "./Results/gprofiler2.module.enrichment.txt"
if(file.exists(mod.enrich.path)){
print(paste("Module GO Enrichment file detected. Importing...",mod.enrich.path))
mod.enrich <- data.table::fread(mod.enrich.path)
} else {
bulk.mod.enrich <- gprofiler2.module_enrichment(bulk.modules, dataset = "bulk-RNA-seq")
sc.mod.enrich <- gprofiler2.module_enrichment(sc.modules, dataset = "sc-RNA-seq")
# Merge
mod.enrich <- data.table::rbindlist(list(bulk.mod.enrich, sc.mod.enrich))
# Correct for multiple tests
mod.enrich <- mod.enrich %>% dplyr::mutate(FDR = p.adjust(p_value, method="fdr"),
Bonferroni = p.adjust(p_value, method="bonferroni")) %>% data.frame()
mod.enrich[mod.enrich == "NULL"] <- NA # is.na(mod.enrich) <- mod.enrich == "NULL"
data.table::fwrite(mod.enrich, mod.enrich.path, sep="\t")
}## [1] "Module GO Enrichment file detected. Importing... ./Results/gprofiler2.module.enrichment.txt"
mm.overlap <- module_vs_module_enrichment.terms(MOD.dt = MOD.dt,
mod.enrich = mod.enrich.sig,
top.terms = F,
row.limit = F)## [1] "Comparing overlap of enrichment terms for all module-module combinations."
# Prepare dataframe
jaccard.corr <- merge(x = MOD.dt[,c("Module1.name","Module2.name","Jaccard.index")] %>%
dplyr::rename(Jaccard.index.gene=Jaccard.index),
y = mm.overlap[,c("Module1.name","Module2.name","Jaccard.index",
"Module1.top.term","Module2.top.term")] %>%
dplyr::rename(Jaccard.index.term=Jaccard.index),
by=c("Module1.name","Module2.name"))
# Remove zeros
# jaccard.corr <- subset(jaccard.corr, Jaccard.index.gene > 0 & Jaccard.index.term > 0)
## Histogram
ggplot(jaccard.corr) +
geom_histogram(aes(Jaccard.index.gene, fill="Jaccard.index.gene"), alpha=.5) +
geom_histogram(aes(Jaccard.index.term, fill="Jaccard.index.term"), alpha=.5)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Correlation scatterplot
jaccard.corr <- jaccard.corr %>%
# label.dat <- subset(jaccard.corr, (Jaccard.index.gene >= .05| Jaccard.index.term >=.2) ) %>%
dplyr::mutate(Label=paste0("x: ",round(Jaccard.index.gene,2),
"\ny: ",round(Jaccard.index.term,2),
"\n\n",
# Mod 1
"Mod1 name: ",Module1.name,
"\nTop Term: ",Module1.top.term,
"\n\n",
# Mod 2
"Mod2 name: ",Module2.name,
"\nTop Term: ",Module2.top.term),
Jaccard.sum = Jaccard.index.gene + Jaccard.index.term)
corr.stat <- cor(jaccard.corr$Jaccard.index.gene, jaccard.corr$Jaccard.index.term, method="pearson")
# Scatterplot
sp <- ggplot(jaccard.corr, aes(x=Jaccard.index.gene, y=Jaccard.index.term, text=Label, color=Jaccard.sum)) +
geom_point() +
# ggrepel::geom_text_repel(data = label.dat, aes(label=Label)) +
geom_smooth(method=lm, se=T) +
labs(title = "Correlation between gene-level vs.term-level module-module Jaccard similarities",
subtitle = paste("Pearson's r =",round(corr.stat,3)))
ggp <- plotly::ggplotly(sp, tooltip ="text" )
htmltools::tagList(list(ggp)) # module-module comparisons
MOD.dt <- data.table::fread("./Results/bulk.modules_vs_sc.modules.txt") %>%
arrange(FDR)
top.bulk.mods <- head(MOD.dt, 10)$Module1.name %>% unique()
# One hub per module
bulk.hubs <- data.table::fread("./Data/Katia_Modules/hubs_gencode30.tbl.txt")
top.bulk.hubs <- subset(bulk.hubs, module %in% top.bulk.mods)
# Plot hub expression in sc data
monocle3::plot_cells(cds,
genes=top.bulk.hubs$symbol,
group_cells_by="cluster",
color_cells_by="cluster",
show_trajectory_graph=F) # rasterize = T (for super high-res figures)